##########################################################################################
## Unveiling the geography of historical patents in the United States from 1836 to 1975 ##
##                        Replication Example                                           ##
##########################################################################################


# Installing and Loading Packages

install.packages('tm')
install.packages('stringi')
install.packages('data.table')
install.packages('plyr')
install.packages('RSNNS')
install.packages('neuralnet')
install.packages('kknn')

library(data.table)
library(tm)
library(stringi)
library(plyr)
library(RSNNS)
library(neuralnet)
library(kknn)

load("Example.RData")

ls()


##############
### Step 1 ### 
##############

## 1.a Trimming Documents

length(OCR.Documents)
OCR.Documents[10]

# 1.b Preparing city and state names to look for

head(City.Names)

Cities2Search=as.character(unique(City.Names$City))
Cities2Search=paste(' ',Cities2Search,' ',sep='')

States2Search=as.character(unique(State.Names$Name))
States2Search=paste(' ',States2Search,' ',sep='')

StatesAbb2Search=as.character(unique(State.Names$Abbreviation))


# 1.c Look for state names within the documents

Retrieved.States.Names=lapply(OCR.Documents,function(y) States2Search[stri_detect_fixed(y,States2Search)])
Retrieved.States.Abbreviations=lapply(OCR.Documents,function(y) StatesAbb2Search[stri_detect_fixed(y,StatesAbb2Search)])

head(Retrieved.States.Names)

Retrieved.States.Names=lapply(Retrieved.States.Names, function(y) as.character(State.Names$State.Code[match(gsub("^\\s+|\\s+$", "", y),State.Names$Name)]))
Retrieved.States.Abbreviations=lapply(Retrieved.States.Abbreviations, function(y) as.character(State.Names$State.Code[match(y,State.Names$Abbreviation)]))

State.Matches=mapply(c, Retrieved.States.Names, Retrieved.States.Abbreviations, SIMPLIFY=FALSE)
head(State.Matches)


# 1.d Look for city names within the documents (It may take several minutes)

City.Matches=lapply(OCR.Documents,function(y) Cities2Search[stri_detect_fixed(y,Cities2Search,case_insensitive=F)] )
head(City.Matches)
 
head(Inventors)
head(Assignees)

# 1.d We now create a database with characteristics of each located state name

I=sapply(State.Matches,length)!=0 
summary(I)

List=mapply(function(x,y,r,z,t) list(x,y,r,z,t),OCR.Documents[I],State.Matches[I],paste(Assignees$Original.Assignee[I],Assignees$Original.Assignee.E[I],sep=','),paste(Inventors$Inventors[I],Inventors$Inventors.E[I],sep=','),Patents$Publication.number[I],SIMPLIFY = FALSE)
List[[1]]

Keywords=c('COMPANY','CORPORATION','ASSIGNOR','INVENTOR')
Cutoff=c(' FIG ',' FIGURE ','REFERENCES CITED')


State.Data=lapply(List,function(z) {
  
  # Each (z) corresponds to a document, where the first element is the description, the
  # second the detected state, and so on.
  
  
  # How many times the state was found?
  Data=data.frame(count(stri_trans_toupper(z[[2]])),stringsAsFactors=F)
  colnames(Data)=c('State','Frequency')
  Data$State=as.character(Data$State)

  # Finding the state names
  temp=lapply(Data$State,function(y) State.Names[stri_detect_fixed(State.Names$State.Code,y),1:2])
  
  # For each State we retrieve the position in the document, it can be found several times in the text.
  Location.S=lapply(temp,function(y) unlist(stri_locate_all_fixed(z[[1]],as.character(y),omit_no_match = T)))
  Min.Location=sapply(Location.S,function(y) min(y,na.rm=T))
  Max.Location=sapply(Location.S,function(y) max(y,na.rm=T))
  Mean.Location=sapply(Location.S,function(y) mean(y,na.rm=T))
  Median.Location=sapply(Location.S,function(y) median(y,na.rm=T))
  
  # Find the Location of important words
  Location=unlist(lapply(Keywords,function(y) unlist(stri_locate_all_fixed(z[[1]],y,omit_no_match = T))))
  Min.Keywords=min(Location,na.rm=T)
  Max.Keywords=max(Location,na.rm=T)
  Mean.Keywords=mean(Location,na.rm=T)
  Median.Keywords=median(Location,na.rm=T)
  
  # Find the Location of important words
  Location=unlist(lapply(Cutoff,function(y) unlist(stri_locate_all_fixed(z[[1]],y,omit_no_match = T))))
  Min.Cutoff=min(Location,na.rm=T)
  
  # Finding The Assignee location (first clean it )
  Assig=sapply(stri_split_fixed(z[[3]],','),function (x) gsub("^\\s+|\\s+$", "", as.character(stripWhitespace(removeWords(x,c(stri_trans_toupper(letters)))))))
  Assig=stripWhitespace(stri_replace_all_fixed(Assig, "[[:punct:]]", ""))

  Location=unlist(lapply(Assig,function(y) unlist(stri_locate_all_fixed(z[[1]],y,omit_no_match = T))))
  Min.Assignees=min(Location,na.rm=T)
  Max.Assignees=max(Location,na.rm=T)
  Mean.Assignees=mean(Location,na.rm=T)
  Median.Assignees=median(Location,na.rm=T)
  
  # Finding The Inventor location
  # First clean it
  
  Inv=sapply(stri_split_fixed(z[[4]],','),function (x) gsub("^\\s+|\\s+$", "", as.character(stripWhitespace(removeWords(x,c(stri_trans_toupper(letters),letters))))))
  Inv=stripWhitespace(stri_replace_all_fixed(Inv, "[[:punct:]]", ""))

  Location=unlist(lapply(Inv,function(y) unlist(stri_locate_all_fixed(z[[1]],y,omit_no_match = T))))
  Min.Inv=min(Location,na.rm=T)
  Max.Inv=max(Location,na.rm=T)
  Mean.Inv=mean(Location,na.rm=T)
  Median.Inv=median(Location,na.rm=T)
  
  # Now lets look for some important words, like County, resident, etc. 
  # And evaluate their distance  with respect to the located state name.
  State.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],' STATE ',omit_no_match = T)[[1]]),FUN='-'))
  County.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],' COUNTY ',omit_no_match = T)[[1]]),FUN='-'))
  City.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],' CITY ',omit_no_match = T)[[1]]),FUN='-'))
  Subject.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],'SUBJECT',omit_no_match = T)[[1]]),FUN='-'))
  Resident.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],c('RESIDENT','RESIDING'),omit_no_match = T)[[1]]),FUN='-'))
  BIKT.Locations=lapply(Location.S,function(y) outer(y,rowMeans(stri_locate_all_fixed(z[[1]],'BE IT KNOWN THAT',omit_no_match = T)[[1]]),FUN='-'))
  
  R.of.State=unlist(lapply(State.Locations,function(y) min(y[y>0])))
  L.of.State=unlist(lapply(State.Locations,function(y) max(y[y<0])))
  R.of.County=unlist(lapply(County.Locations,function(y) min(y[y>0])))
  L.of.County=unlist(lapply(County.Locations,function(y) max(y[y<0])))
  R.of.City=unlist(lapply(City.Locations,function(y) min(y[y>0])))
  L.of.City=unlist(lapply(City.Locations,function(y) max(y[y<0])))
  R.of.Res=unlist(lapply(Resident.Locations,function(y) min(y[y>0])))
  L.of.Res=unlist(lapply(Resident.Locations,function(y) max(y[y<0])))
  R.of.BIKT=unlist(lapply(BIKT.Locations,function(y) min(y[y>0])))
  L.of.BIKT=unlist(lapply(BIKT.Locations,function(y) max(y[y<0])))
  
  # Compile
  Data=data.frame(Data,Publication.number=z[[5]],Min.Cutoff,L.of.BIKT,R.of.BIKT,R.of.State,L.of.State,R.of.County,L.of.County,R.of.City,L.of.City,L.of.Res,R.of.Res,Min.Location,Max.Location,Mean.Location,Median.Location,Min.Keywords,Max.Keywords,Mean.Keywords,Median.Keywords,Min.Inv,Max.Inv,Mean.Inv,Median.Inv,Min.Assignees,Max.Assignees,Mean.Assignees,Median.Assignees)  
  
  return(Data)
  
})

State.Data=rbindlist(State.Data)

head(State.Data)

# 1.e Do the same for city names
head(City.Data)


# 1.f Now merge the data
City.State.Data=merge(data.frame(City.Data),data.frame(unique(City.Names[,c(1:2)])),by='City',allow.cartesian=TRUE)
colnames(State.Data)[-c(1,3)]=paste(colnames(State.Data)[-c(1,3)],'State',sep = '.')
City.State.Data=merge(City.State.Data,State.Data,by=c('Publication.number','State'),all.x=T)
head(City.State.Data)


##############
## Step 2 ####
##############

head(Sample)

BTW=function(x,l,u) {y=1*I(as.numeric(x)>l & as.numeric(x)<u)
y[is.na(y)]=0
return(y)
}


# 2.c Divide the Sample into training and an evaluation set, by patent number.
set.seed(1)
ID <- sample(1:2, length(unique(Sample$Publication.number)), replace = TRUE)
ID=ID[match(Sample$Publication.number,unique(Sample$Publication.number))]

SampleTR=Sample[ID==1,]
SampleTS=Sample[ID==2,]


# 2.d Estimating a Probit Model
Probit=glm(Correct~poly(Min.Location,5,raw=T)+BTW(L.of.STREET,-2,0)+I(BTW(Min.Location-Min.Location.State,50,10000) | BTW(Min.Location.State-Min.Location,50,10000))+I(Min.Countries!='Inf')+BTW(Min.Countries-Min.Location,0,15)+I(BTW(Min.Location.State-Min.Cutoff,0,10000)==1 & I(Min.Location.State/Length)>0.5)+I(BTW(Min.Location-Min.Cutoff,0,10000)==1 & I(Min.Location/Length)>0.5)+COC+BTW(R.of.Corporation,0,10)+BTW(R.of.State,0,10)+ Country.Name+SUBSTRING+factor(nchar(City))+BTW(Min.Location.State-Min.Location,0,10)+BTW(Min.Location-Min.Location.State,0,50)+I(BTW(Min.Location.State-Min.Location,0,10) & I(Min.Location/Length)>0.5)+I(BTW(Min.Location-Min.Location.State,0,50) & I(Min.Location/Length)>0.5)+Detected.Name+Discard+is.na(Min.Location.State) +poly(Min.Location/Length,5,raw=T)+as.numeric(Max.Location/Length)+BTW(R.of.City,0,15)+BTW(R.of.County,0,15)+I(BTW(R.of.City,0,15) | BTW(R.of.County,0,15)) + BTW(Min.Location-Min.Keywords,-40,40) + I(I((is.na(Min.Location.State))) & BTW(Min.Location-Min.Keywords,-40,40)) ,data=SampleTR,family = binomial(link = "probit"))
Y.Hat= predict(Probit,newdata=SampleTS,type='response')

# Assesment
Probit.Table=table(SampleTS$Correct,1*(Y.Hat>0.5))
colnames(Probit.Table)=c('Predicted as Incorrect (Y.Hat<0.5)','Predicted as Correct (Y.Hat>0.5)')
row.names(Probit.Table)=c('Incorrect','Correct')
Probit.Table


# 2.d Lets normalize the data for the KNN and NN procedures

X0=model.frame(Probit$formula,SampleTR)
X0=as.matrix(X0)
X0[X0=="FALSE"| X0==" FALSE"]='0'
X0[X0=="TRUE" | X0==" TRUE"]='1'
X0=apply(X0,2,function(y) as.numeric(y))
X0=normalizeData(X0, type = "0_1")
X0TR=data.frame(X0)

X0=model.frame(Probit$formula,SampleTS)
X0=as.matrix(X0)
X0[X0=="FALSE"| X0==" FALSE"]='0'
X0[X0=="TRUE" | X0==" TRUE"]='1'
X0=apply(X0,2,function(y) as.numeric(y))
X0=normalizeData(X0, type = "0_1")
X0TS=data.frame(X0)

# 2.e Estimating a KNN Model (It may take several minutes)

For=paste(colnames(X0TR[,-1]),collapse='+')
KNN <- kknn(paste('X1~',For,collapse=''),train=X0TR,test=X0TS,k=10,scale=F,kernel='epanechnikov')
Y.Hat.KNN=KNN[[1]]

# Assesment
KNN.Table=table(SampleTS$Correct,1*(Y.Hat.KNN>0.5))
colnames(KNN.Table)=c('Predicted as Incorrect (Y.Hat<0.5)','Predicted as Correct (Y.Hat>0.5)')
row.names(KNN.Table)=c('Incorrect','Correct')
KNN.Table


# 2.f Estimating a NN Model (It may take several hours)
For=paste(colnames(X0TR[,-1]),collapse='+')
NN <- neuralnet(paste('X1~',For,collapse=''),data=X0TR,hidden=15,linear.output = F)
# Note: the results are already in the environment

Y.Hat.NN=compute(NN,covariate=X0TS[,-1])$net.result

# Assesment
NN.Table=table(SampleTS$Correct,1*(Y.Hat.NN>0.5))
colnames(NN.Table)=c('Predicted as Incorrect (Y.Hat<0.5)','Predicted as Correct (Y.Hat>0.5)')
row.names(NN.Table)=c('Incorrect','Correct')
NN.Table


#####################
# Graphic Analysis ##
#####################

# a) Create measures of Alpha and Coverage for each procedure
Threshold=seq(0.01,0.99,0.01)

# Probit Model
Probit.Results=lapply(Threshold,function(y){
  temp=table(SampleTS$Correct,1*(Y.Hat>y))
  Alpha=temp[1,2]/sum(temp[,2])
  Coverage=temp[2,2]/sum(temp[2,])
  return(data.frame(Threshold=y,Alpha=Alpha,Coverage=Coverage))
})
Probit.Results=rbindlist(Probit.Results)


# KNN Model
KNN.Results=lapply(Threshold,function(y){
  temp=table(SampleTS$Correct,1*(Y.Hat.KNN>y))
  Alpha=temp[1,2]/sum(temp[,2])
  Coverage=temp[2,2]/sum(temp[2,])
  return(data.frame(Threshold=y,Alpha=Alpha,Coverage=Coverage))
})
KNN.Results=rbindlist(KNN.Results)

# NN Model
NN.Results=lapply(Threshold,function(y){
  temp=table(SampleTS$Correct,1*(Y.Hat.NN>y))
  Alpha=temp[1,2]/sum(temp[,2])
  Coverage=temp[2,2]/sum(temp[2,])
  return(data.frame(Threshold=y,Alpha=Alpha,Coverage=Coverage))
})
NN.Results=rbindlist(NN.Results)




# b) Performance of Models 
par(mfrow=c(1,3))

plot(Probit.Results$Threshold,Probit.Results$Coverage,main='Probit Model | Coverage and Alpha',ylab='',xlab='Threshold',type='l',lwd=3,bty='l',col='dodgerblue4',cex.main=1.5,cex.axis=1,cex.lab=1.5,ylim=c(0,1))
abline(h=0.05,col='black')
lines(Probit.Results$Threshold,Probit.Results$Alpha,ylab='',type='l',lwd=3,bty='l',col='red')
legend('topright',legend=c('Coverage','Alpha'),bty='n',lwd = 2,col = c('dodgerblue4','red'))

plot(KNN.Results$Threshold,KNN.Results$Coverage,main='KNN Model | Coverage and Alpha',ylab='',xlab='Threshold',type='l',lwd=3,bty='l',col='dodgerblue4',cex.main=1.5,cex.axis=1,cex.lab=1.5,ylim=c(0,1))
abline(h=0.05,col='black')
lines(KNN.Results$Threshold,KNN.Results$Alpha,ylab='',type='l',lwd=3,bty='l',col='red')
legend('topright',legend=c('Coverage','Alpha'),bty='n',lwd = 2,col = c('dodgerblue4','red'))

plot(NN.Results$Threshold,NN.Results$Coverage,main='NN Model | Coverage and Alpha',ylab='',xlab='Threshold',type='l',lwd=3,bty='l',col='dodgerblue4',cex.main=1.5,cex.axis=1,cex.lab=1.5,ylim=c(0,1))
abline(h=0.05,col='black')
lines(NN.Results$Threshold,NN.Results$Alpha,ylab='',type='l',lwd=3,bty='l',col='red')
legend('topright',legend=c('Coverage','Alpha'),bty='n',lwd = 2,col = c('dodgerblue4','red'))


# c) Performance comparison across models

par(mfrow=c(1,1))
plot(Probit.Results$Alpha,Probit.Results$Coverage,main='Comparison Across Models',ylab='Coverage',xlab='Alpha',type='l',lwd=3,bty='l',col='dodgerblue4',cex.main=1.5,cex.axis=1,cex.lab=1.5,ylim=c(0,1))
lines(KNN.Results$Alpha,KNN.Results$Coverage,ylab='',type='l',lwd=3,bty='l',col='red')
lines(NN.Results$Alpha,NN.Results$Coverage,ylab='',type='l',lwd=3,bty='l',col='dark green')
legend('bottomright',legend=c('Probit','KNN','NN'),bty='n',lwd = 3,col = c('dodgerblue4','red','dark green'))

# d) Geographical Distribution
install.packages('plotly')
library('plotly')

Geography=unique(subset(Sample,Correct==1)[,1:3])
Geography=merge(Geography,City.Names,by=c('City','State'))
Geography$Place=paste(Geography$County,Geography$State,sep = ' | ')
Geography=count(Geography,'Place')
Geography=Geography[order(Geography$freq,decreasing = T),]

Predicted.Geography=unique(subset(SampleTS,Y.Hat>0.75)[,1:3])
Predicted.Geography=merge(Predicted.Geography,City.Names,by=c('City','State'))
Predicted.Geography$Place=paste(Predicted.Geography$County,Predicted.Geography$State,sep = ' | ')
Predicted.Geography=count(Predicted.Geography,'Place')
I=match(Geography$Place,Predicted.Geography$Place)
Predicted.Geography=Predicted.Geography[I,]
Predicted.Geography$freq[is.na(Predicted.Geography$freq)]=0
Predicted.Geography$Place=Geography$Place


# Plot with all the Values 
plot_ly(x=Geography$Place , y = Geography$freq/sum(Geography$freq), type = "bar",opacity = 0.9,name='Sample')  %>%
  add_trace(x=Predicted.Geography$Place, y = Predicted.Geography$freq/sum(Predicted.Geography$freq), type = "bar",opacity = 0.6,name='Predicted') %>%
  layout(barmode='overlay', showlegend = T,xaxis = list(title=' ',autotick = T,showticklabels = FALSE,tickcolor = toRGB("white")), yaxis = list(title='Distribution'))

# Note that NY is highly under-represented in the final sample, in the original code 
# we included specific variables for those cities that coincide with the name of the 
# state. This improved the accuracy drastically.

# Plot with only positive Values 
plot_ly(x=Geography$Place[Predicted.Geography$freq>0] , y = Geography$freq[Predicted.Geography$freq>0]/sum(Geography$freq), type = "bar",opacity = 0.9,name='Sample')  %>%
  add_trace(x=Predicted.Geography$Place[Predicted.Geography$freq>0], y = Predicted.Geography$freq[Predicted.Geography$freq>0]/sum(Predicted.Geography$freq), type = "bar",opacity = 0.6,name='Predicted') %>%
  layout(barmode='overlay', showlegend = T,xaxis = list(title=' ',autotick = T,showticklabels = FALSE,tickcolor = toRGB("white")), yaxis = list(title='Distribution'))



